/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*
------------------------------------------------------------------------------
 INPUT AND OUTPUT DEFINITIONS

 Inputs:
    xpos = x half-pixel of (x,y) coordinates within a VOP; motion
           compensated coordinates; native type
    ypos = y half-pixel of (x,y) coordinates within a VOP; motion
           compensated coordinates; native type
    comp = pointer to 8-bit compensated prediction values within a VOP;
        computed by this module (i/o); full-pel resolution
    c_prev = pointer to previous 8-bit prediction values within a VOP;
          values range from (0-255); full-pel resolution
    sh_d = pointer to residual values used to compensate the predicted
        value; values range from (-512 to 511); full-pel resolution
    width = width of the VOP in pixels (x axis); full-pel resolution
    rnd1 = rounding value for case when one dimension uses half-pel
           resolution
    rnd2 = rounding value for case when two dimensions use half-pel
           resolution
    CBP = flag indicating whether residual is all zeros
          (0 -> all zeros, 1 -> not all zeros)
    outside_flag = flag indicating whether motion vector is outside the
           VOP (0 -> inside, 1 -> outside)

 Outputs:
    returns 1

 Local Stores/Buffers/Pointers Needed:
    None

 Global Stores/Buffers/Pointers Needed:
    None

 Pointers and Buffers Modified:
    comp = buffer contains newly computed compensated prediction values

 Local Stores Modified:
    None

 Global Stores Modified:
    None

------------------------------------------------------------------------------
 FUNCTION DESCRIPTION

 Compute pixel values for a block in the current VOP. The prediction
 values are generated by averaging pixel values in the previous VOP; the
 block position in the previous frame is computed from the current block's
 motion vector. The final pixel values are then obtained by adding the
 prediction values to the block residual values.


------------------------------------------------------------------------------
*/

/*----------------------------------------------------------------------------
; INCLUDES
----------------------------------------------------------------------------*/
#include "mp4dec_lib.h"
#include "motion_comp.h"

#define OSCL_DISABLE_WARNING_CONV_POSSIBLE_LOSS_OF_DATA

/*
 * GetPredAdvancedBy0x0
 *
 * Copy a B_SIZE x B_SIZE block of pixels from `prev` into `pred_block`
 * without any interpolation.
 *
 *   prev           - first source pixel of the block; successive rows are
 *                    `width` bytes apart
 *   pred_block     - first destination pixel of the block; successive rows
 *                    are (pred_width_rnd >> 1) bytes apart
 *   width          - row stride of `prev`, in bytes
 *   pred_width_rnd - destination row stride stored in bits 1 and up;
 *                    bit 0 is not used by this function
 *
 * Returns 1 unconditionally.
 *
 * Pixels are moved one byte at a time on purpose: word-wide accesses through
 * casted pointers would make the result depend on byte order and on the
 * alignment of both buffers, and would touch bytes outside the block.
 * Only the B_SIZE bytes of each row are read and written here.
 *
 * NOTE(review): the column count uses B_SIZE; the word-based code this
 * replaces moved exactly 8 bytes per row, so B_SIZE is presumed to be 8 --
 * confirm against its definition.
 */
int GetPredAdvancedBy0x0(
    uint8 *prev,        /* i */
    uint8 *pred_block,      /* i */
    int width,      /* i */
    int pred_width_rnd /* i */
)
{
    const int pred_stride = pred_width_rnd >> 1; /* row stride of pred_block */
    int row, col;

    for (row = 0; row < B_SIZE; row++)
    {
        /* Row pointers are derived from the block origin so that no pointer */
        /* outside the block rows is ever formed.                            */
        const uint8 *src = prev + row * width;
        uint8 *dst = pred_block + row * pred_stride;

        for (col = 0; col < B_SIZE; col++)
        {
            dst[col] = src[col];
        }
    }

    return 1;
}

/**************************************************************************/
/*
 * GetPredAdvancedBy0x1
 *
 * Build a B_SIZE x B_SIZE prediction block in which every output pixel is
 * the average of a source pixel and its right-hand neighbour
 * (presumably the horizontal half-pel case -- confirm against the caller):
 *
 *     pred[y][x] = (prev[y][x] + prev[y][x + 1] + rnd1) >> 1
 *
 *   prev           - first source pixel of the block; successive rows are
 *                    `width` bytes apart.  B_SIZE + 1 pixels are read from
 *                    each of the B_SIZE rows.
 *   pred_block     - first destination pixel of the block; successive rows
 *                    are (pred_width_rnd >> 1) bytes apart
 *   width          - row stride of `prev`, in bytes
 *   pred_width_rnd - bit 0 is the rounding term rnd1 (0 or 1); the remaining
 *                    bits (pred_width_rnd >> 1) are the destination stride
 *
 * Returns 1 unconditionally.
 *
 * The average is computed per pixel in int arithmetic instead of on packed
 * 32-bit words: this keeps the result independent of byte order and buffer
 * alignment and reads nothing beyond the B_SIZE + 1 pixels each row needs.
 *
 * NOTE(review): the column count uses B_SIZE; the word-based code this
 * replaces produced exactly 8 pixels per row, so B_SIZE is presumed to be
 * 8 -- confirm against its definition.
 */
int GetPredAdvancedBy0x1(
    uint8 *prev,        /* i */
    uint8 *pred_block,      /* i */
    int width,      /* i */
    int pred_width_rnd /* i */
)
{
    const int pred_stride = pred_width_rnd >> 1; /* row stride of pred_block */
    const int rnd1 = pred_width_rnd & 1;         /* rounding term, 0 or 1    */
    int row, col;

    for (row = 0; row < B_SIZE; row++)
    {
        const uint8 *src = prev + row * width;
        uint8 *dst = pred_block + row * pred_stride;

        for (col = 0; col < B_SIZE; col++)
        {
            /* Sum is at most 255 + 255 + 1, so the halved value fits a byte. */
            dst[col] = (uint8)((src[col] + src[col + 1] + rnd1) >> 1);
        }
    }

    return 1;
}

/**************************************************************************/
/*
 * GetPredAdvancedBy1x0
 *
 * Build a B_SIZE x B_SIZE prediction block in which every output pixel is
 * the average of a source pixel and the pixel directly below it
 * (presumably the vertical half-pel case -- confirm against the caller):
 *
 *     pred[y][x] = (prev[y][x] + prev[y + 1][x] + rnd1) >> 1
 *
 *   prev           - first source pixel of the block; successive rows are
 *                    `width` bytes apart.  B_SIZE pixels are read from each
 *                    of B_SIZE + 1 rows.
 *   pred_block     - first destination pixel of the block; successive rows
 *                    are (pred_width_rnd >> 1) bytes apart
 *   width          - row stride of `prev`, in bytes
 *   pred_width_rnd - bit 0 is the rounding term rnd1 (0 or 1); the remaining
 *                    bits (pred_width_rnd >> 1) are the destination stride
 *
 * Returns 1 unconditionally.
 *
 * The average is computed per pixel in int arithmetic instead of on packed
 * 32-bit words: this keeps the result independent of byte order, of the
 * alignment of either buffer and of `width` being a multiple of 4, and
 * reads nothing outside the B_SIZE columns of the rows involved.
 *
 * NOTE(review): the column count uses B_SIZE; the word-based code this
 * replaces produced exactly 8 pixels per row, so B_SIZE is presumed to be
 * 8 -- confirm against its definition.
 */
int GetPredAdvancedBy1x0(
    uint8 *prev,        /* i */
    uint8 *pred_block,      /* i */
    int width,      /* i */
    int pred_width_rnd /* i */
)
{
    const int pred_stride = pred_width_rnd >> 1; /* row stride of pred_block */
    const int rnd1 = pred_width_rnd & 1;         /* rounding term, 0 or 1    */
    int row, col;

    for (row = 0; row < B_SIZE; row++)
    {
        const uint8 *upper = prev + row * width; /* current source row   */
        const uint8 *lower = upper + width;      /* source row below it  */
        uint8 *dst = pred_block + row * pred_stride;

        for (col = 0; col < B_SIZE; col++)
        {
            /* Sum is at most 255 + 255 + 1, so the halved value fits a byte. */
            dst[col] = (uint8)((upper[col] + lower[col] + rnd1) >> 1);
        }
    }

    return 1;
}

/**********************************************************************************/
/*
 * GetPredAdvancedBy1x1
 *
 * Builds an 8-byte-wide, B_SIZE-row prediction in which every output byte is
 * the rounded average of a 2x2 neighbourhood of the source:
 *
 *     out[x] = (p[x] + p[x+1] + p[x+width] + p[x+width+1] + r) >> 2
 *
 * with r = (pred_width_rnd & 1) + 1.
 *
 * Four pixels are processed per 32-bit word.  To keep the per-byte sums from
 * spilling into the neighbouring lane, every byte is split into its upper six
 * bits and its lower two bits; the two parts are summed separately and the
 * carry of the low part is folded back in at the end.
 *
 * prev           - first source pixel (any byte alignment).  The routine
 *                  rounds the address down to a 4-byte boundary and reads 12
 *                  bytes from this row and 12 bytes from the row `width`
 *                  bytes further on, for each output row.
 * pred_block     - destination; two 32-bit words are stored per row.
 *                  NOTE(review): accessed through uint32*, so it is presumably
 *                  expected to be 4-byte aligned - confirm with callers.
 * width          - byte distance between two consecutive source rows.
 *                  NOTE(review): the second-row loads are word loads at
 *                  (aligned prev + width), so width is presumably a multiple
 *                  of 4 - confirm with callers.
 * pred_width_rnd - bit 0 selects the rounding term (r = bit0 + 1); the value
 *                  shifted right by one is the destination row step in bytes.
 *
 * The lane shuffling treats the least significant byte of a loaded word as
 * the lowest-addressed pixel.
 *
 * Always returns 1.
 */
int GetPredAdvancedBy1x1(
    uint8 *prev,        /* i */
    uint8 *pred_block,      /* o */
    int width,      /* i */
    int pred_width_rnd /* i */
)
{
    uint    row;                    /* remaining rows */
    uint32  hi[3], lo[3];           /* vertical sums: upper-6-bit / lower-2-bit parts */
    uint32  upper, lower;           /* raw words of the two source rows */
    uint32  upper_hi, lower_hi;
    uint32  near_hi, far_hi;        /* pixel x / pixel x+1 vectors, upper part */
    uint32  near_lo, far_lo;        /* pixel x / pixel x+1 vectors, lower part */
    uint32  carry;
    uint32  *src_top, *src_bot, *dst;
    uint32  hi_mask, lo_mask, rnd_lanes;
    int     src_step, dst_step;
    int     skew, down, up;
    int     k;

    /* Each row consumes 8 source bytes, then skips the rest of the line. */
    src_step = 8 + (width - B_SIZE);
    /* Each row emits 8 bytes; together with the remainder this is the row step. */
    dst_step = pred_width_rnd >> 1;

    /* Rounding term replicated into all four byte lanes. */
    rnd_lanes = (uint32)((pred_width_rnd & 1) + 1);
    rnd_lanes |= (rnd_lanes << 8);
    rnd_lanes |= (rnd_lanes << 16);

    hi_mask = 0x3F3F3F3F;           /* upper six bits of a byte, after >> 2 */
    lo_mask = hi_mask << 2;         /* 0xFCFCFCFC */

    /* Move back to the enclosing word; the skew is undone by the shifts below. */
    skew = (int)((uintptr_t)prev & 3);
    prev -= skew;
    down = skew << 3;               /* 0, 8, 16 or 24 */
    up = 24 - down;                 /* 24, 16, 8 or 0 */

    for (row = B_SIZE; row > 0; row--)
    {
        src_top = (uint32*)prev;
        src_bot = (uint32*)(prev + width);

        /* Vertical pass: add the two rows lane by lane for three words. */
        for (k = 0; k < 3; k++)
        {
            upper = src_top[k];
            lower = src_bot[k];

            upper_hi = (upper >> 2) & hi_mask;
            lower_hi = (lower >> 2) & hi_mask;

            hi[k] = upper_hi + lower_hi;
            /* xor removes the upper six bits, leaving the low two of each byte */
            lo[k] = (upper ^(upper_hi << 2)) + (lower ^(lower_hi << 2));
        }

        /* Horizontal pass: add each lane to its right-hand neighbour. */
        dst = (uint32*)pred_block;
        for (k = 0; k < 2; k++)
        {
            /*
             * Extract four lanes starting `skew` bytes (near) and `skew + 1`
             * bytes (far) into the pair hi[k]:hi[k+1].  Each shift is split
             * in two so that the count never reaches 32.
             */
            near_hi = (hi[k] >> down) | ((hi[k + 1] << 8) << up);
            far_hi  = ((hi[k] >> 8) >> down) | (hi[k + 1] << up);
            near_lo = (lo[k] >> down) | ((lo[k + 1] << 8) << up);
            far_lo  = ((lo[k] >> 8) >> down) | (lo[k + 1] << up);

            /* Low parts plus rounding; keep only what carries into the result. */
            carry = (near_lo + far_lo + rnd_lanes) & lo_mask;

            dst[k] = near_hi + far_hi + (carry >> 2);
        }

        prev += src_step;
        pred_block += dst_step;
    }

    return 1;
}


